In the original dataset, NA value is labeled as “..”.
ClimateRaw1 = readxl::read_xls("Climate_Change.xls",sheet = 1,na = "..",col_types = "guess")
ClimateRaw2 = readxl::read_xls("Climate_Change.xls",sheet = 2,na = "..",col_types = "guess")
ClimateRaw3 = readxl::read_xls("Climate_Change.xls",sheet = 3,na = "..",col_types = "guess")
When cleaning the datatable, there are several country code which is not actually country. Such as LIC represent low income countries and these rows represent an aggregated data. In order to separate these when plotting, we need to save these country codes for later use.
Pull out region and income codes
Regioncode = dplyr::filter(ClimateRaw2,Region == "Aggregates")[-grep("income",ClimateRaw2$`Country name`),] # Both the Income and Region aggregated rows will have a "Aggregates" value in the Region column
Regioncode = Regioncode %>% select(`Country code`) %>% pull()
Incomecode = dplyr::filter(ClimateRaw2,Region == "Aggregates")[grep("income",ClimateRaw2$`Country name`),]
Incomecode = Incomecode %>% select(`Country code`)%>%pull()
There are 57 different variable observed for each country every year from 1990 to 2011(Ofcourse there are NA values,actually quite a lot. some is just missing, and some is measured every 10 years instead of every year.). After carefully examined 3rd sheet in the raw data, 9 variables are kept, they are:
Relatedvariables = dplyr::select(ClimateRaw3,`Series code`) %>%
slice(1:4,14:15,41:43)%>%
pull()
print(Relatedvariables)
## [1] "SP.POP.TOTL" "SP.POP.GROW" "NY.GDP.MKTP.CD"
## [4] "NY.GNP.PCAP.CD" "SP.URB.TOTL" "SP.URB.GROW"
## [7] "EN.ATM.CO2E.PC" "EN.ATM.CO2E.PP.GD.KD" "EN.ATM.CO2E.KT"
SP.POP.TOTLSP.POP.GROWNY.GDP.MKTP.CDNY.GNI.PCAP.CDSP.URB.TOTLSP.URB.GROWEN.ATM.CO2E.PCEN.ATM.CO2E.PP.GD.KDEN.ATM.CO2E.KT Using the codes to filter original Climate data
Climate <- dplyr::filter(ClimateRaw1,`Series code` %in% Relatedvariables)
print(summary(as.factor(Climate$`Series code`)))
## EN.ATM.CO2E.KT EN.ATM.CO2E.PC EN.ATM.CO2E.PP.GD.KD
## 233 233 233
## NY.GDP.MKTP.CD NY.GNP.PCAP.CD SP.POP.GROW
## 233 233 232
## SP.POP.TOTL SP.URB.GROW SP.URB.TOTL
## 233 232 233
Because the second sheet of the original data contains categorical data indicating a country’s income status and geographic info. Assign these to the data table using dplyr::left_join, and also spread the table to become longer.
Climate <- left_join(Climate,select(ClimateRaw2,`Country code`,`Income group`,Region)) %>%
melt(measure.vars = 7:28)%>%
dplyr::rename(year = variable)
kable(Climate%>%dplyr::sample_n(10,replace = FALSE))%>%
kable_styling() %>%
scroll_box(width = "100%", height = "400px")
| Country code | Country name | Series code | Series name | SCALE | Decimals | Income group | Region | year | value |
|---|---|---|---|---|---|---|---|---|---|
| PYF | French Polynesia | SP.URB.GROW | Urban population growth (annual %) | 0 | 1 | High income: nonOECD | East Asia & Pacific | 1996 | 1.594565e+00 |
| WLD | World | SP.URB.TOTL | Urban population | 0 | 0 | Aggregates | Aggregates | 2003 | 3.010250e+09 |
| GRC | Greece | SP.POP.GROW | Population growth (annual %) | 0 | 1 | High income: OECD | Europe & Central Asia | 2009 | 4.055627e-01 |
| IDN | Indonesia | SP.POP.GROW | Population growth (annual %) | 0 | 1 | Lower middle income | East Asia & Pacific | 2001 | 1.307325e+00 |
| SAS | South Asia | EN.ATM.CO2E.KT | CO2 emissions, total (KtCO2) | 0 | 1 | Aggregates | Aggregates | 1991 | 8.299118e+05 |
| COG | Congo, Rep. | EN.ATM.CO2E.PC | CO2 emissions per capita (metric tons) | 0 | 1 | Lower middle income | Sub-Saharan Africa | 1993 | 5.964629e-01 |
| BEN | Benin | EN.ATM.CO2E.KT | CO2 emissions, total (KtCO2) | 0 | 1 | Low income | Sub-Saharan Africa | 2005 | 2.566900e+03 |
| TMP | Timor-Leste | SP.POP.TOTL | Population | 0 | 0 | Lower middle income | East Asia & Pacific | 2008 | 1.079749e+06 |
| BHR | Bahrain | SP.URB.TOTL | Urban population | 0 | 0 | High income: nonOECD | Middle East & North Africa | 2011 | NA |
| TUV | Tuvalu | EN.ATM.CO2E.PP.GD.KD | CO2 emissions per units of GDP (kg/$1,000 of 2005 PPP $) | 0 | 1 | Lower middle income | East Asia & Pacific | 2007 | NA |
Get Series code table, Income code table as well as Region table
SeriesCode = ClimateRaw3%>%
select(`Series code`,`Series name`,Definition)%>%
dplyr::filter(`Series code` %in% Relatedvariables)
IncomeTable = ClimateRaw2%>%
select(`Country code`,`Country name`)%>%
filter(`Country code` %in% Incomecode)
IncomeTable = IncomeTable[c(1,6,5,3,2,4),]# re-order it
RegionTable = ClimateRaw2%>%
select(`Country code`,`Country name`)%>%
filter(`Country code` %in% Regioncode)
Check NA values in world data WLD, Region data and also Income group data
Climate%>%dplyr::filter(`Country code`=="WLD",is.na(value))%>%
dplyr::group_by(year)%>%
dplyr::select(`Series code`,year,value)%>%
summary()
## Series code year value
## Length:18 2011 :9 Min. : NA
## Class :character 2008 :3 1st Qu.: NA
## Mode :character 2009 :3 Median : NA
## 2010 :3 Mean :NaN
## 1990 :0 3rd Qu.: NA
## 1991 :0 Max. : NA
## (Other):0 NA's :18
Climate%>%dplyr::filter(`Country code`%in% Incomecode,is.na(value))%>%
dplyr::group_by(year)%>%
dplyr::select(`Series code`,year,value)%>%
summary()
## Series code year value
## Length:90 2011 :54 Min. : NA
## Class :character 2009 :18 1st Qu.: NA
## Mode :character 2010 :18 Median : NA
## 1990 : 0 Mean :NaN
## 1991 : 0 3rd Qu.: NA
## 1992 : 0 Max. : NA
## (Other): 0 NA's :90
Climate%>%dplyr::filter(`Country code`%in% Regioncode,is.na(value))%>%
dplyr::group_by(year)%>%
dplyr::select(`Series code`,year,value)%>%
summary()
## Series code year value
## Length:147 2011 :79 Min. : NA
## Class :character 2010 :30 1st Qu.: NA
## Mode :character 2009 :28 Median : NA
## 1990 : 4 Mean :NaN
## 1991 : 3 3rd Qu.: NA
## 2008 : 3 Max. : NA
## (Other): 0 NA's :147
As shown above, 2011 is missing too many datas, year 2011 will be excluded from the Dataset
Climate <- Climate%>%dplyr::filter(year != 2011)
check again
Climate%>%dplyr::filter(`Country code`=="WLD",is.na(value))%>%
dplyr::group_by(year)%>%
dplyr::select(`Series code`,year,value)%>%
summary()
## Series code year value
## Length:9 2008 :3 Min. : NA
## Class :character 2009 :3 1st Qu.: NA
## Mode :character 2010 :3 Median : NA
## 1990 :0 Mean :NaN
## 1991 :0 3rd Qu.: NA
## 1992 :0 Max. : NA
## (Other):0 NA's :9
Climate%>%dplyr::filter(`Country code`%in% Incomecode,is.na(value))%>%
dplyr::group_by(year)%>%
dplyr::select(`Series code`,year,value)%>%
summary()
## Series code year value
## Length:36 2009 :18 Min. : NA
## Class :character 2010 :18 1st Qu.: NA
## Mode :character 1990 : 0 Median : NA
## 1991 : 0 Mean :NaN
## 1992 : 0 3rd Qu.: NA
## 1993 : 0 Max. : NA
## (Other): 0 NA's :36
Climate%>%dplyr::filter(`Country code`%in% Regioncode,is.na(value))%>%
dplyr::group_by(year)%>%
dplyr::select(`Series code`,year,value)%>%
summary()
## Series code year value
## Length:68 2010 :30 Min. : NA
## Class :character 2009 :28 1st Qu.: NA
## Mode :character 1990 : 4 Median : NA
## 1991 : 3 Mean :NaN
## 2008 : 3 3rd Qu.: NA
## 1992 : 0 Max. : NA
## (Other): 0 NA's :68
Plot our data to have a taste of what it looks like
p <- ggplot(Climate%>%dplyr::filter(`Series code` == Relatedvariables[8],`Country code` %in% Regioncode))+
aes(x = as.numeric(as.character(year)),y = value,color = `Country name`)+
geom_line()+xlab("Year")+
ggtitle(Relatedvariables[8])
plotly_build(p)%>%layout(legend = list(orientation = 'h',x = 0, y = -.3))
dplyr::filter(ClimateRaw2,Region != "Aggregates")%>%select(Region)%>%pull()%>%unique()
## [1] "Latin America & Caribbean" "Europe & Central Asia"
## [3] "South Asia" "Sub-Saharan Africa"
## [5] "Middle East & North Africa" "East Asia & Pacific"
## [7] "North America"
From the dataset 3 countries are labeled in North America region: USA, CAN and BMU. Below are the reconstucting process.
# Check if NAS has been occupied
Climate%>%dplyr::filter(`Country code`=="NAS")
## [1] Country code Country name Series code Series name SCALE
## [6] Decimals Income group Region year value
## <0 rows> (or 0-length row.names)
# Checked
Calculate Total CO2 emission, Total Population, GDP and Urban population
# Total CO2 emission
D1 <- Climate%>%
dplyr::filter(Region == "North America")%>%
pivot_wider(names_from = year,values_from = value)%>%
dplyr::filter(`Series code` == Relatedvariables[9])%>%
dplyr::select(1:8)%>%
dplyr::slice(1)
D1 <- D1%>%as.data.frame()
D1[1,c(1:2,7:8)] = c("NAS","North America","Aggregates","Aggregates")
D2 = Climate%>%
dplyr::filter(Region == "North America")%>%
pivot_wider(names_from = year,values_from = value)%>%
dplyr::filter(`Series code` == Relatedvariables[9])%>%summarise_at(.vars = 9:29,.funs = sum)
D = dplyr::bind_cols(D1,D2)
# Total Population
D1 <- Climate%>%
dplyr::filter(Region == "North America")%>%
pivot_wider(names_from = year,values_from = value)%>%
dplyr::filter(`Series code` == Relatedvariables[1])%>%
dplyr::select(1:8)%>%
dplyr::slice(1)
D1 <- D1%>%as.data.frame()
D1[1,c(1:2,7:8)] = c("NAS","North America","Aggregates","Aggregates")
D2 = Climate%>%
dplyr::filter(Region == "North America")%>%
pivot_wider(names_from = year,values_from = value)%>%
dplyr::filter(`Series code` == Relatedvariables[1])%>%
summarise_at(.vars = 9:29,.funs = sum)
D <- dplyr::bind_rows(D, dplyr::bind_cols(D1,D2))
# GDP
D1 <- Climate%>%
dplyr::filter(Region == "North America")%>%
pivot_wider(names_from = year,values_from = value)%>%
dplyr::filter(`Series code` == Relatedvariables[3])%>%
dplyr::select(1:8)%>%
dplyr::slice(1)
D1 <- D1%>%as.data.frame()
D1[1,c(1:2,7:8)] = c("NAS","North America","Aggregates","Aggregates")
D2 = Climate%>%
dplyr::filter(Region == "North America")%>%
pivot_wider(names_from = year,values_from = value)%>%
dplyr::filter(`Series code` == Relatedvariables[3])%>%
summarise_at(.vars = 9:29,.funs = sum)
D <- dplyr::bind_rows(D, dplyr::bind_cols(D1,D2))
# Urban Population
D1 <- Climate%>%dplyr::filter(Region == "North America")%>%pivot_wider(names_from = year,values_from = value)%>%dplyr::filter(`Series code` == Relatedvariables[5])%>%dplyr::select(1:8)%>%dplyr::slice(1)
D1 <- D1%>%as.data.frame()
D1[1,c(1:2,7:8)] = c("NAS","North America","Aggregates","Aggregates")
D2 = Climate%>%dplyr::filter(Region == "North America")%>%pivot_wider(names_from = year,values_from = value)%>%dplyr::filter(`Series code` == Relatedvariables[5])%>%summarise_at(.vars = 9:29,.funs = sum)
D <- dplyr::bind_rows(D, dplyr::bind_cols(D1,D2))
Calculate CO2 per capita, CO2 per unit PPP Bermuda’s PPP factor is missing from the world bank data, the cloest one I could find is a data given by federal reserve bank
pppBer <- 1.98021
pppCan <- 1.21
D_Temp <- D%>%dplyr::select(`Series name`,`1990`:`2010`)%>%
column_to_rownames(var = "Series name")%>%
transpose_df()
Temp_Colnames <- colnames(D_Temp)
colnames(D_Temp) <- c("year","CO2","Pop","GDP","Urban")
# Calculate ppp in 2005 for North america
ppp2005 <- Climate%>%dplyr::filter(Region == "North America")%>%
pivot_wider(names_from = year,values_from = value)%>%
dplyr::filter(`Series code` == Relatedvariables[3])%>%
dplyr::select(`Country code`,`2005`)%>%
column_to_rownames(var = "Country code")%>%
transpose_df()%>%dplyr::transmute(`North America` = BMU*pppBer + CAN * pppCan + USA)%>%
pull()
D_Temp = D_Temp%>%dplyr::mutate(CO2percap = CO2*1000/Pop,CO2perGDP = CO2*1000000000/ppp2005)
Adjust comparing variables exclude Pop growth and Urban growth and GNI per cap. (For simplicity)
SeriesCode <- SeriesCode%>%slice(-2,-4,-6)
# update related Variables
Related_variables <- SeriesCode%>%
dplyr::select(`Series code`)%>%
pull()
Temp_Colnames <- SeriesCode%>%
dplyr::select(`Series name`)%>%
pull()
#set adequate colname order and rename the D_Temp cols
colnames(D_Temp) <- c("year",Temp_Colnames[c(6,1:5)])
# Transpose back
D_Temp <- D_Temp%>%column_to_rownames(var = "year")%>%transpose_df()
colnames(D_Temp)[colnames(D_Temp)=="rowname"] <- "Series name"
print(colnames(D_Temp))
## [1] "Series name" "1990" "1991" "1992" "1993"
## [6] "1994" "1995" "1996" "1997" "1998"
## [11] "1999" "2000" "2001" "2002" "2003"
## [16] "2004" "2005" "2006" "2007" "2008"
## [21] "2009" "2010"
Reconstruct the remaining part of climate data
D_ = data.frame()
for (i in 1:length(Related_variables)){
D1 <- Climate%>%
dplyr::filter(Region == "North America")%>%
pivot_wider(names_from = year,values_from = value)%>%
dplyr::filter(`Series code` == Related_variables[i])%>%
dplyr::select(1:8)%>%
dplyr::slice(1)
D1 <- D1%>%as.data.frame()
D1[1,c(1:2,7:8)] = c("NAS","North America","Aggregates","Aggregates")
D_ <- bind_rows(D_,D1)
}
# Combine D_ and D_Temp
D <- left_join(D_,D_Temp,by = "Series name")%>%pivot_longer(cols = 9:29,names_to = "year",values_to = "value")
Add these to the original Climate table and we keep the related variables
Climate <- bind_rows(Climate,D)
Climate <- Climate%>%
dplyr::filter(`Series code` %in% Related_variables)
# update Regioncode
Regioncode <- append(Regioncode,"NAS")
Replot the previous plot
p <- ggplot(Climate%>%dplyr::filter(`Series code` == Related_variables[1],`Country code` %in% Regioncode))+
aes(x = as.numeric(as.character(year)),y = value,color = `Country name`)+
geom_line()+xlab("year")+
ggtitle(Related_variables[1])
plotly_build(p)%>%layout(legend = list(orientation = 'h',x = 0, y = -0.3))
Series code is not user friendly, we would use Series name instead
SeriesName = SeriesCode%>%dplyr::select(`Series name`)%>%pull()
try plot
p <- ggplot(Climate%>%dplyr::filter(`Series name` == SeriesName[1],`Country code` %in% c(Incomecode,"WLD")))+
aes(x = as.numeric(as.character(year)),y = value,color = `Country name`)+
geom_line()+
xlab("year")+
ggtitle(SeriesName[1])
plotly_build(p)%>%layout(legend = list(orientation = 'h',x = 0, y = -0.2))
p <- ggplot(Climate%>%dplyr::filter(`Series name` == SeriesName[2],`Country code` %in% c(Incomecode,"WLD")))+
aes(x = as.numeric(as.character(year)),y = value,color = `Country name`)+
geom_line()+
xlab("year")+
ggtitle(SeriesName[2])
plotly_build(p)%>%layout(legend = list(orientation = 'h',x = 0, y = -0.2))
p <- ggplot(Climate%>%dplyr::filter(`Series name` == SeriesName[3],`Country code` %in% c(Incomecode,"WLD")))+
aes(x = as.numeric(as.character(year)),y = value,color = `Country name`)+
geom_line()+
xlab("year")+
ggtitle(SeriesName[3])
plotly_build(p)%>%layout(legend = list(orientation = 'h',x = 0, y = -0.2))
p <- ggplot(Climate%>%dplyr::filter(`Series name` == SeriesName[4],`Country code` %in% c(Incomecode,"WLD")))+
aes(x = as.numeric(as.character(year)),y = value,color = `Country name`)+
geom_line()+
xlab("year")+
ggtitle(SeriesName[4])
plotly_build(p)%>%layout(legend = list(orientation = 'h',x = 0, y = -0.2))
p <- ggplot(Climate%>%dplyr::filter(`Series name` == SeriesName[5],`Country code` %in% c(Incomecode,"WLD")))+
aes(x = as.numeric(as.character(year)),y = value,color = `Country name`)+
geom_line()+
xlab("year")+
ggtitle(SeriesName[5])
plotly_build(p)%>%layout(legend = list(orientation = 'h',x = 0, y = -0.2))
p <- ggplot(Climate%>%dplyr::filter(`Series name` == SeriesName[6],`Country code` %in% c(Incomecode,"WLD")))+
aes(x = as.numeric(as.character(year)),y = value,color = `Country name`)+
geom_line()+
xlab("year")+
ggtitle(SeriesName[6])
plotly_build(p)%>%layout(legend = list(orientation = 'h',x = 0, y = -0.2))
We shall see how the data is distributed
TempD <- Climate%>%dplyr::filter(`Series name` == SeriesName[1],!`Country code` %in% Incomecode & !`Country code` %in% Regioncode,year %in% c(1992,2008))
p1 <- ggplot(TempD)+
aes(x =log(value),color = `Region`,fill = `Region`)+facet_wrap(as.factor(TempD$year),nrow = 2)+
geom_histogram(bins = 30,aes(y = ..density..),position = "dodge",alpha = .2)+
geom_density(alpha = .3)+ylab("year")+
xlab(paste0("log ",SeriesName[1]))
plotly_build(p1)#%>%layout(legend = list(orientation = 'h',x = 0, y = -0.2))
TempD <- Climate%>%dplyr::filter(`Series name` == SeriesName[2],!`Country code` %in% Incomecode & !`Country code` %in% Regioncode,year %in% c(1992,2008))
p1 <- ggplot(TempD)+
aes(x =log(value),color = `Region`,fill = `Region`)+facet_wrap(as.factor(TempD$year),nrow = 2)+
geom_histogram(bins = 30,aes(y = ..density..),position = "dodge",alpha = .2)+
geom_density(alpha = .3)+ylab("year")+
xlab(paste0("log ",SeriesName[2]))
plotly_build(p1)
TempD <- Climate%>%dplyr::filter(`Series name` == SeriesName[3],!`Country code` %in% Incomecode & !`Country code` %in% Regioncode,year %in% c(1992,2008))
p1 <- ggplot(TempD)+
aes(x =log(value),color = `Region`,fill = `Region`)+facet_wrap(as.factor(TempD$year),nrow = 2)+
geom_histogram(bins = 30,aes(y = ..density..),position = "dodge",alpha = .2)+
geom_density(alpha = .3)+ylab("year")+
xlab(paste0("log ",SeriesName[3]))
plotly_build(p1)
TempD <- Climate%>%dplyr::filter(`Series name` == SeriesName[4],!`Country code` %in% Incomecode & !`Country code` %in% Regioncode,year %in% c(1992,2008))
p1 <- ggplot(TempD)+
aes(x =log(value),color = `Region`,fill = `Region`)+facet_wrap(as.factor(TempD$year),nrow = 2)+
geom_histogram(bins = 30,aes(y = ..density..),position = "dodge",alpha = .2)+
geom_density(alpha = .3)+ylab("year")+
xlab(paste0("log ",SeriesName[4]))
plotly_build(p1)
TempD <- Climate%>%dplyr::filter(`Series name` == SeriesName[5],!`Country code` %in% Incomecode & !`Country code` %in% Regioncode,year %in% c(1992,2008))
p1 <- ggplot(TempD)+
aes(x =log(value),color = `Region`,fill = `Region`)+facet_wrap(as.factor(TempD$year),nrow = 2)+
geom_histogram(bins = 30,aes(y = ..density..),position = "dodge",alpha = .2)+
geom_density(alpha = .3)+ylab("year")+
xlab(paste0("log ",SeriesName[5]))
plotly_build(p1)
TempD <- Climate%>%dplyr::filter(`Series name` == SeriesName[6],!`Country code` %in% Incomecode & !`Country code` %in% Regioncode,year %in% c(1992,2008))
p1 <- ggplot(TempD)+
aes(x =log(value),color = `Region`,fill = `Region`)+facet_wrap(as.factor(TempD$year),nrow = 2)+
geom_histogram(bins = 30,aes(y = ..density..),position = "dodge",alpha = .2)+
geom_density(alpha = .3)+ylab("year")+
xlab(paste0("log ",SeriesName[6]))
plotly_build(p1)
try pivot series code,log all input
Climate1 <- Climate%>%
dplyr::select(1:3,7:10)%>%
pivot_wider(names_from = `Series code`,values_from = value)%>%
dplyr::mutate_at(.vars = 6:11,.funs = log)
TempD <- Climate1%>%dplyr::filter(!`Country code` %in% Incomecode & !`Country code` %in% Regioncode,year ==2005 )
p3 <- ggplot(TempD)+aes(y = `EN.ATM.CO2E.KT`,x = `SP.POP.TOTL`)+geom_point(aes(size = `NY.GDP.MKTP.CD`,color = `Income group`))+scale_size_continuous(range = c(0,4))+xlab("Total Population, size = GDP")+ylab("Total CO2 Emission")
plotly_build(p3)%>%layout(legend = list(orientation = 'h',x = 0, y = -0.3))
TempD <- Climate1%>%dplyr::filter(!`Country code` %in% Incomecode & !`Country code` %in% Regioncode,year ==2005 )
p3 <- ggplot(TempD)+aes(y = `EN.ATM.CO2E.PC`,x = `SP.POP.TOTL`)+geom_point(aes(size = `NY.GDP.MKTP.CD`,color = `Income group`))+scale_size_continuous(range = c(0,4))+xlab("Total Population, size = GDP")+ylab("Total CO2 Emission per Cap")
plotly_build(p3)%>%layout(legend = list(orientation = 'h',x = 0, y = -0.3))
Speculation: Higher GDP, Higher income is related to higher